Nowadays, a project usually generates a huge amount of documents, and this study designed a topic program based on a topic model for project management. A topic model is a machine learning algorithm; it assumes that documents are distributions of latent topics and that topics are distributions of words. The program applies Latent Dirichlet Allocation (LDA), a popular topic model algorithm, to build the topic model and applies "prototypical text based interpretation" (PTBI) and the visualisation of pyLDAvis to identify the salient topics, the prototypical paragraphs, as well as the minimum number of texts needed for topic interpretation. In this executive summary, I will show you how the program works step by step.
Install the libraries below. Download the two CSS files from https://github.com/suhao3123/CSS, create a folder named assets in the root of your app directory, and include the two files in that folder to launch the Dashboard we create in the final section.
# pip install numpy # (install numpy)
# pip install pandas # (install pandas)
# pip install PyMuPDF # (install PyMuPDF for extracting info from PDF files)
# pip install tika # (install tika for extracting paragraphs from PDF files)
# pip install spacy==2.2.0 # (install spacy for lemmatization)
# conda install gensim # (install gensim for topic modelling)
# pip install pyLDAvis # (install pyLDAvis for topic visualisation)
# conda install -c conda-forge pyldavis # (if you use Anaconda to install pyLDAvis)
# pip install plotly # (install plotly for visualisation)
import pandas as pd
import numpy as np
import re
# glob for extracting the directories of metadata
import glob
# PyMuPDF
import fitz
# tika
import tika
from tika import parser
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# spacy for lemmatization
import spacy
# Visualisation
import plotly.express as px
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline
# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import os
Input the directory of the pdf files you want to analyse, the chunks below will extract the texts and info of the files.
# Extract the paths of the PDF files; make sure the folder name does not contain numbers.
# Use a raw string so backslashes in the Windows path are not read as escape
# sequences (e.g. "\9" would otherwise be an invalid/deprecated escape).
pdf_dir = r"D:\LEON\Business Analytics\Study\9. Business Project\Data set\Olympics"
pdf_files = glob.glob("%s/*.pdf" % pdf_dir)
pdf_files[:1]
# Use PyMuPDF to extract all info of the PDF files (text, title, date, etc.)
list_metadata = []
for i in pdf_files:
    with fitz.open(i) as doc:
        info = doc.metadata
        info['file_name'] = os.path.basename(i)
        text = ''
        for page in doc:
            # page.getText() is the deprecated alias; get_text() is the
            # current PyMuPDF API name
            text += page.get_text()
        info['Content'] = text
    list_metadata.append(info)
df = pd.DataFrame(list_metadata)
df['document_id'] = df.index           # keep the pre-deduplication row id
df = df.drop_duplicates(subset=['Content'])  # drop duplicate rows
#df = df.dropna(subset=df.columns[[12]], how='any') # drop rows whose text content is NaN
#df['Word_count'] = df['Content'].str.count(' ') + 1
df.head(3)
# Optional check for documents with very few words (commented out by default;
# requires the Word_count column computed in the commented line above)
#min_word_count= 10 # threshold for the minimum word count of each document
#min_word_count_filter = df['Word_count'] <= min_word_count
#df_few_words = df[min_word_count_filter][['file_name', 'Content']]
#df_few_words
df.info()
# Total word count of the corpus (requires the Word_count column)
#df['Word_count'].sum( )
The texts extracted above will be split into individual words.
# if this does not work, try installing the latest version of pandas
data = df[df.columns[12]].tolist()

def sent_to_words(sentences):
    """Tokenize each document into a list of lowercase word tokens.

    deacc=True removes punctuation/accents. gensim's simple_preprocess
    accepts str directly, so the original manual ``.encode('utf-8')`` (which
    handed it bytes that gensim then had to decode again) is unnecessary.
    """
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

data_words = list(sent_to_words(data))
First, the stopwords will be removed and users can add more stop words manually. Next, the bigrams (phrases containing two words) and trigrams (phrases containing three words) will be formed, then the words will be lemmatised (reducing different forms of a word to a single word). Finally, a threshold allows users to remove short words.
# Build the bigram and trigram phrase models (a higher threshold yields fewer phrases)
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# pull in the stop-word list bundled with gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS

stop_words = list(STOPWORDS)
# extra stop words identified while analysing the key words of each topic in
# pyLDAvis (section 5.2. Topic visualisation)
new_stop_words = [
    'go', 'would', 'make', 'think', 'take', 'say', 'need', 'want', 'thing',
    'have', 'lot', 'people', 'year', 'good', 'great', 'able', 'come', 'look',
    'right', 'sure', 'day', 'moment', 'work', 'time', 'know', 'use', 'try',
    'happen', 'ask', 'new', 'way', 'jonathan_stephen', 'david_higgin',
    'dame_helen_ghosh', 'end',
]
stop_words.extend(new_stop_words)
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stop_words(texts):
    """Drop every token found in the module-level stop_words list."""
    cleaned = []
    for doc in texts:
        cleaned.append([token for token in doc if token not in stop_words])
    return cleaned
def make_bigrams(texts):
    """Apply the trained bigram Phraser to every tokenised document."""
    result = []
    for doc in texts:
        result.append(bigram_mod[doc])
    return result
def make_trigrams(texts):
    """Apply the bigram then the trigram Phraser to every tokenised document."""
    result = []
    for doc in texts:
        result.append(trigram_mod[bigram_mod[doc]])
    return result
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise each tokenised document with spaCy.

    Only tokens whose part-of-speech tag is in *allowed_postags* are kept.
    See https://spacy.io/api/annotation for the tag set.

    The default was a mutable list; a tuple avoids the shared-mutable-default
    pitfall while behaving identically for membership tests.

    NOTE: relies on the module-level ``nlp`` spaCy pipeline being loaded
    before this function is called.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Form trigrams from the tokenised documents
data_words_trigrams = make_trigrams(data_words)

# Initialise the spaCy 'en' model, keeping only the tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# raise the maximum text length the pipeline will accept
nlp.max_length = 13000000

# Lemmatise, keeping only nouns, adjectives and verbs
data_lemmatized1 = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

# Drop words shorter than the threshold below
minimum_len = 3
data_lemmatized2 = [[w for w in doc if len(w) >= minimum_len] for doc in data_lemmatized1]

# Remove stop words
data_lemmatized = remove_stop_words(data_lemmatized2)
The processed words will be inputted to generate the Dictionary and Corpus to build the topic model. The Dictionary assigns an ID (0, 1, 2, etc.) to each word; the Corpus is a list of (word ID, word frequency) of each document. We can set two parameters to filter out more stopwords as shown below.
# Create the Dictionary; the two parameters below trim tokens by document frequency
no_below = 5     # drop tokens appearing in fewer than no_below documents (absolute number)
no_above = 0.85  # drop tokens appearing in more than this fraction of all documents
id2word = corpora.Dictionary(data_lemmatized)
id2word.filter_extremes(no_below=no_below, no_above=no_above)
# report the vocabulary size and word count left after trimming
print('After removal of high and low frequency words - Number of unique tokens: %d, %d' % (len(id2word),id2word.num_pos))
# Create the Corpus: bag-of-words (token id, frequency) pairs for each document
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
Now we can input the Dictionary and Corpus to build the LDA model, a basic and widely-used topic model. We might need to tune the parameters and hyperparameters to get a higher coherence score, a measure evaluating the interpretability of the topics extracted.
First we set the training parameters and hyperparameters.
# Training parameters and hyperparameters for the LDA model
k = 20 # number of topics
passes = 20 # number of full training passes through the corpus
iterations = 100 # maximum E-step iterations per document; limiting this parameter might cause some documents not to converge in time
alpha = 50.0/k # document-topic density; a high alpha tends to return more salient topics in each document
eta = 0.01 # topic-word density: prior probability assigned to each term
random_state = 12345 # random seed for reproducibility
minimum_probability = 0 # topics with a probability lower than this threshold will be filtered out
Now, we need to plot the coherence score against k to identify the optimal k where the coherence score reaches its highest point. Because running it is quite time-consuming, I deactivated some chunks below and just set k to 10 based on the analysis of the result. If users want to fit the model to another corpus, they can remove the hashes to reactivate the chunks and analyse the coherence scores against k.
#start=1; limit=21; step=1 # parameters generating a sequence of k values from "start" to "limit" by a step of "step"
#coherence_values = []
#model_list = []
#for i in range(start,limit,step):
#model = gensim.models.LdaModel(corpus = corpus,id2word = id2word,alpha = alpha,eta = eta,iterations = iterations,num_topics = i,passes = passes,random_state = 12345,minimum_probability = minimum_probability)
#model_list.append(model)
#coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
#coherence_values.append(coherencemodel.get_coherence())
#list_num_topics = [i for i in range(start, limit, step)]
#df_coherence1 = pd.DataFrame({'Number_of_Topics': list_num_topics, 'Coherence_Score': coherence_values})
#df_coherence1.to_pickle('./df_coherence1.pkl') #save the result to disk
#df_coherence = pd.read_pickle('./df_coherence1.pkl') #load the result from disk
#fig1 = px.line(df_coherence, x = 'Number_of_Topics', y = "Coherence_Score", title = 'Coherence scores against number of topics')
#fig1.update_layout(autosize=False, width=1000, height=400)
#fig1.update_traces(mode = "lines + markers")
#fig1.show()
# set the number of topics that yields the highest coherence score (from the sweep above)
k = 10
# Fit the LDA model with the parameters chosen above
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=k,
    alpha=alpha,
    eta=eta,
    iterations=iterations,
    passes=passes,
    random_state=random_state,
    minimum_probability=minimum_probability,
)
# c_v coherence of the fitted model; higher means more interpretable topics
coherencemodel2 = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_score = coherencemodel2.get_coherence()
coherence_score
Now we can get the topic distribution of documents.
# helper for converting a list of (key, value) tuples into a dictionary
def Convert(tup, di):
    """Return ``dict(tup)``.

    The *di* parameter is kept only for backward compatibility: the original
    implementation immediately rebound it, so it never influenced the result
    and was never mutated.
    """
    return dict(tup)
# topic distribution of documents
list_topic = []
dictionary_topic = {}
for d in texts:
    bow = id2word.doc2bow(d)
    belong = lda_model[bow]  # list of (topic id, probability) tuples for this document
    list_topic.append(Convert(belong, dictionary_topic))
# one row per document, one column per topic
df_topic_distribution = pd.DataFrame(list_topic)
# shift the topic IDs by one so they match the 1-based IDs shown by pyLDAvis
original_topic_id = list(df_topic_distribution)
new_topic_id = [x + 1 for x in original_topic_id]
df_topic_distribution = df_topic_distribution.rename(columns=dict(zip(original_topic_id, new_topic_id)))
# merge the topic distribution with the document metadata
df_topic = pd.merge(df, df_topic_distribution, how='left', left_index=True, right_index=True)
df_topic.drop(['title','format','creator', 'producer', 'keywords', 'trapped', 'encryption','subject', 'modDate'], axis = 1)
The tools aim at assisting users to interpret the topics extracted above more efficiently and transparently. We first identify the salient topics defined by PTBI proposed by Marchetti and Puranam (2020), then combine both the topic visualisation of pyLDAvis and the prototypical texts defined by PTBI to facilitate the topic interpretation.
Not all topics can be easily interpreted because a topic model is likely to produce more topics than a human reader can easily interpret; therefore, PTBI selects only the salient topics for interpretation. For each topic, we need to compute the fraction of documents whose probability of belonging to the topic is greater than 1/K (Marchetti and Puranam, 2020, p. 14), and I defined this fraction as the “salience” of the topic.
The scree plot below shows that when the topics are sorted by salience in descending order, the salience tends to reach a low level and level off on topic 6, as a result, we can select the topics ahead of topic 6 as the salient topics for interpretation.
# compute salience: for each topic, the fraction of documents whose probability
# of belonging to the topic exceeds 1/k
list_percent_above = []
total_docs = len(df_topic_distribution)
for col in df_topic_distribution:
    num_above = df_topic_distribution[col][df_topic_distribution[col] > 1/k].count()
    list_percent_above.append(num_above / total_docs)
# sort the topics by salience, most salient first
df_salient_topic = pd.DataFrame(
    {'topic_ID': [str(i) for i in new_topic_id], 'salience': list_percent_above}
).sort_values(by='salience', ascending=False)
fig_L1 = px.line(df_salient_topic, x='topic_ID', y='salience', title="Scree plot of salience of topics")
fig_L1.update_layout(autosize=False, width=800, height=400)
fig_L1.update_traces(mode="lines + markers")
fig_L1.show()
I apply PyLDAvis to visualise the topics. The circles on the left panel represent the topics; their areas are proportional to the prevalence of the topics; the distance between topics indicates the similarity between topics. The words on the right panel are sorted by the relevance of the words in a topic, a novel measure for topic interpretation weighted by both the overall word frequency and the estimated word frequency in the topic. The λ on the right-top corner needs to be set to 0.6 to increase the interpretability.
Check the words of each topic, if there're common words with high overall frequency such as "think" "want" or "make", return to the "import the stop_words from gensim" section, add these words to the list of stop words to remove them.
# Render the topic visualisation inline and persist it for the dashboard
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics=False)
# save the result to disk (embedded by the dashboard below)
pyLDAvis.save_html(vis, './assets/lda.html')
vis
The prototypical paragraphs, the paragraphs with the highest probability that they belong to a topic, can be used to assist topic interpretation. This section classifies the paragraphs into topics and provides users with 4 types of filters to select the prototypical paragraphs: N most prototypical paragraphs overall, N most prototypical paragraphs where the belong() function is greater than the threshold L, N most prototypical paragraphs of each topic and N most prototypical paragraphs of a specific topic.
The documents will be separated into paragraphs based on the delimiters representing blank lines. Users can compare the original files to the parsed texts of the files to identify the correct delimiters.
# function for splitting a document into paragraphs by delimiters
def para_split(i):
    """Parse PDF file *i* with tika and split its text into paragraphs.

    The delimiters represent sentence endings followed by blank lines;
    users can modify them to suit their documents.
    """
    parsed = parser.from_file(i)
    content = parsed['content']
    # ``re`` is already imported at module level, so the original
    # function-local ``import re`` was redundant and has been removed
    return re.split('[?.!-]\n|[?.!-] \n| \n\n|\n\n[0-9]', content)
list_paragraphs = []
list_para_id = []
for path in pdf_files:
    paras = para_split(path)
    paras = [p.replace('\n', '') for p in paras]
    paras = [p.strip() for p in paras if p.strip()]  # drop empty paragraphs
    list_paragraphs.append(paras)
    list_para_id.append(list(range(len(paras))))
# attach the paragraph lists to the document metadata, then explode to one
# row per paragraph
df_para1 = df.copy()
df_para1['paragraphs'] = list_paragraphs
df_para1['para_id'] = list_para_id
df_para2 = df_para1.apply(pd.Series.explode)
df_para3 = df_para2.reset_index()
df_para4 = df_para3[['creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs']]
# number of paragraphs extracted
len(df_para4)
The following chunks allow users to compare the original documents to the parsed texts to check whether the paragraphs are separated correctly; if not, they can modify the delimiters above to separate the paragraphs again. Remove the hashes to activate the functions.
#df_para4['word_count'] = df_para4['paragraphs'].str.split().str.len()
#df_para4.sort_values(by = 'word_count', ascending = False) # sort the paragraphs by word count in descending order to check whether the word counts look normal
#para_id = 7 # input the index of a paragraph that might not be separated correctly based on its word count
#df_para4.loc[para_id,'paragraphs'] # print the paragraph
#df_para4.loc[para_id,'file_name'] # get the file name of the paragraph
#doc = parser.from_file(r'D:\\LEON\\Business Analytics\\Study\\9. Business Project\\Data set\\Olympics\\Examination_of_Witnesses_Sept_2003_-_Q1-19.pdf') # input the file name
#doc_text = doc['content']
#doc_text # get the parsed text of the file
#print(doc_text) # get the original layout of the file, then compare it to the parsed text above to identify the correct delimiters
After the paragraphs are separated, users can set a threshold to filter out paragraphs with short length, such as references.
# Keep only the paragraphs whose word count reaches the threshold below
n_word_count = 10  # minimum word count for a paragraph to be kept
para_word_count = df_para4['paragraphs'].str.split().str.len()
df_para = df_para4[para_word_count >= n_word_count].reset_index()
df_para
The paragraphs are processed in the same manners that the documents are processed.
# Tokenise the paragraphs
data2 = df_para[df_para.columns[5]].tolist()
data_words2 = list(sent_to_words(data2))
# Form trigrams
data_words_trigrams2 = make_trigrams(data_words2)
# Lemmatise, keeping only nouns, adjectives and verbs
data_lemmatized2 = lemmatization(data_words_trigrams2, allowed_postags=['NOUN', 'ADJ', 'VERB'])
# Apply the same minimum word length used for the documents
data_lemmatized2_2 = [[w for w in doc if len(w) >= minimum_len] for doc in data_lemmatized2]
# Remove stop words
data_lemmatized2_1 = remove_stop_words(data_lemmatized2_2)
Now we fit the paragraphs to the trained LDA model, and the paragraphs will be classified based on the probability that the paragraphs belong to the topics. Users can drop the meaningless paragraphs after examining the prototypical paragraphs in the next section.
# belong function: classify the topics of the paragraphs; this might take a long
# time because there are 148,651 paragraphs in the 11,132,849-word corpus
list_topic_para = []
dictionary_topic_para = {}
for d in data_lemmatized2_1:
    bow = id2word.doc2bow(d)
    list_topic_para.append(Convert(lda_model[bow], dictionary_topic_para))
df_topic_para = pd.DataFrame(list_topic_para)
# rename topic IDs so they match the 1-based IDs used by pyLDAvis
df_topic_para = df_topic_para.rename(columns=dict(zip(original_topic_id, new_topic_id)))
# topic distribution of paragraphs, merged with the paragraph metadata
df_topic_para1_1 = pd.merge(df_para, df_topic_para, how='left', left_index=True, right_index=True)
df_topic_para1_1
# save the result to disk
df_topic_para1_1.to_pickle('./df_topic_para_Olympics.pkl')
# load the result from disk
df_topic_para1 = pd.read_pickle('./df_topic_para_Olympics.pkl')
# drop paragraphs that are frequent but meaningless for interpretation, found
# while examining the prototypical paragraphs in the next section
list_remove_para = [7622, 12966]
df_topic_para2 = df_topic_para1.copy().drop(list_remove_para)
# save the result to disk
df_topic_para2.to_pickle('./df_topic_para_Olympics2.pkl')
# print the topic distribution of paragraphs
df_topic_para2
Print the N paragraphs with the highest probability that they belong to a topic among the corpus.
# N most prototypical paragraphs overall
df_topic_para2_n = df_topic_para2.copy()
# Snapshot the topic-probability columns BEFORE appending derived columns:
# in the original, the second iloc[:, 6:] also scanned the freshly added
# 'highest_p' column, and idxmax only returned the right topic because ties
# resolve to the first (topic) column.
_topic_cols = df_topic_para2_n.iloc[:, 6:]
df_topic_para2_n['highest_p'] = _topic_cols.max(axis=1)       # highest topic probability per paragraph
df_topic_para2_n['salient_topic'] = _topic_cols.idxmax(axis=1)  # corresponding topic ID
df_topic_para2_n = df_topic_para2_n[['index', 'file_name', 'salient_topic', 'paragraphs', 'highest_p']]
df_topic_para2_n.columns = ['Index', 'file', 'topic', 'paragraph', 'probability']
N1 = 5  # number of most prototypical paragraphs to show
df_topic_para2_n.nlargest(N1, ['probability']).style.set_properties(subset=['paragraph'], **{'width': '1000px', 'length': '50px'})
For each topic, print the N paragraphs with the highest probability that they belong to the topic and the probability should be not less than a threshold.
I followed the method of extraction of prototypical text suggested by PTBI (Marchetti and Puranam, 2020. p. 14). PTBI attempts to not only extract the prototypical documents to improve interpretability, but also to find the minimum number of prototypical documents for topic interpretation. The algorithm is shown as follows:
List_num_doc = [x for x in range(1, 20, 1)]  # candidate minimum numbers of documents per topic (1/L)
list_L = [1/x for x in List_num_doc]         # corresponding thresholds L

def perc(i, df, num_topics=None):
    """Fraction of potentially interpretable topics at threshold L = *i*.

    For each topic column in *df*, count the paragraphs whose probability of
    belonging to the topic is at least *i*; the topic is interpretable when
    that count is at least 1/i.

    *num_topics* is the total number of topics used as the denominator; it
    defaults to the module-level ``k`` for backward compatibility, but can be
    passed explicitly so the function no longer depends on a global.
    """
    if num_topics is None:
        num_topics = k
    list_num_topics = []
    for j in df:
        topic_filter = df[j] >= i
        list_num_topics.append(df[j][topic_filter].count())
    count1 = sum(map(lambda x: x >= 1/i, list_num_topics))
    return count1 / num_topics
The plot shows that when L = 0.333 the percentage of interpretable topics is 100%, so I set L to 0.333 — i.e. each topic needs at least 3 (1/0.333) paragraphs whose probability of belonging to the topic is no less than 0.333 for interpretation. It is worth noting that L is inversely proportional to the minimum number of paragraphs per topic for interpretation (1/L); in other words, the lower the threshold L, the more paragraphs users need to interpret the topics. Although at L = 0.1 the percentage of interpretable topics is also 100%, the minimum number of paragraphs per topic rises to 10 (1/0.1), which increases the workload of interpretation significantly.
# Percentage of interpretable topics for each candidate threshold L.
# The metadata-column drop is loop-invariant, so compute it once instead of
# rebuilding the same frame on every iteration as the original did.
_topic_only = df_topic_para2.drop(columns=['index', 'creationDate', 'document_id', 'file_name', 'para_id', 'paragraphs'])
list_perc2 = []
for threshold in list_L:
    list_perc2.append(perc(threshold, _topic_only))
df_L2 = pd.DataFrame({'Threshold_L': list_L, 'Percentage of interpretable topics': list_perc2})
fig_L2 = px.line(df_L2, x='Threshold_L', y="Percentage of interpretable topics", title='Percentage of interpretable topics')
fig_L2.update_layout(autosize=False, width=800, height=400)
fig_L2.update_traces(mode="lines + markers")
fig_L2.show()
# function for extracting the highest-N-ranked paragraphs of each topic
def top_n_filter(df, top_n, num_topics=None):
    """Return the *top_n* highest-probability paragraphs for every topic.

    df: paragraph dataframe whose topic-probability columns are named 1..num_topics
        and which also carries 'file_name' and 'paragraphs' columns.
    top_n: number of paragraphs to keep per topic.
    num_topics: number of topic columns; defaults to the module-level ``k``.

    Returns one exploded row per (topic, paragraph) pair with columns
    Index, topic_id, file, paragraph, probability.
    """
    if num_topics is None:
        num_topics = k
    list_topic_id = list(range(1, num_topics + 1))
    list_n_para = []
    list_n_p = []
    list_n_index = []
    list_n_file_name = []
    for x in range(1, num_topics + 1):
        # compute the ranking once per topic (the original re-ran nlargest
        # four times per topic)
        top = df.nlargest(top_n, [x])
        list_n_para.append(list(top['paragraphs']))
        list_n_p.append(list(top[x]))
        # BUG FIX: the original read the index from the global df_topic_para2
        # instead of the *df* argument, which silently broke the Index column
        # for any other input frame
        list_n_index.append(list(top.index))
        list_n_file_name.append(list(top['file_name']))
    pd_n_para = pd.DataFrame({'Index': list_n_index, 'topic_id': list_topic_id,
                              'file': list_n_file_name, 'paragraph': list_n_para,
                              'probability': list_n_p})
    return pd_n_para.apply(pd.Series.explode).reset_index().drop('index', axis=1)
Below we get the 3 most prototypical paragraphs of each topic when we set the optimal L to 0.333.
L = 1/3  # the optimal threshold based on the analysis above
# Compute the ranked table once (the original evaluated the same
# top_n_filter(...) expression three times), then keep rows at or above L
df_top_L = top_n_filter(df_topic_para2, int(1/L))
df_top_L[df_top_L['probability'] >= L].style.set_properties(subset=['paragraph'], **{'width': '500px', 'length': '50px'})
For each topic, print the N paragraphs with the highest probability that they belong to the topic.
# Show the 2 most prototypical paragraphs of each topic
N2 = 2
top_n_filter(df_topic_para2, N2).style.set_properties(subset = ['paragraph'], **{'width':'500px', 'length': '50px'})
Select a topic, print the N paragraphs with the highest probability that they belong to the topic.
# Show the N most prototypical paragraphs of one chosen topic
topic_id_chosen = 2  # topic ID to inspect
num_para = 2         # number of prototypical paragraphs to extract
df_n_topic_k = top_n_filter(df_topic_para2, num_para)
topic_id_filter = df_n_topic_k['topic_id'] == topic_id_chosen
df_n_topic_k[topic_id_filter].style.set_properties(subset=['paragraph'], **{'width': '500px', 'length': '50px'})
Below the visualisation of PyLDAvis and the prototypical paragraphs are integrated into a dashboard, users can click the link generated to open the dashboard and explore the topics more easily. To launch the dash, remember to download the two css files from https://github.com/suhao3123/CSS, create a folder named assets in the root of your app directory and include the two files in that folder. After the first run of the whole program, users can run the chunks below independently. If you get the error "Address 'http://127.0.0.1:8092' already in use. Try passing a different port to run_server.", assign a different value to "port= ".
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from jupyter_dash import JupyterDash
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash_table.Format import Format, Scheme, Trim
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
# load the topic distribution of paragraphs from disk
df_topic_para3 = pd.read_pickle('./df_topic_para_Olympics2.pkl')
df_topic_para3_n = df_topic_para3.copy()
# Snapshot the topic-probability columns BEFORE appending derived columns:
# in the original, the second iloc[:, 6:] also scanned the freshly added
# 'highest_p' column, and idxmax only returned the right topic because ties
# resolve to the first (topic) column.
_topic_cols3 = df_topic_para3_n.iloc[:, 6:]
df_topic_para3_n['highest_p'] = _topic_cols3.max(axis=1)
df_topic_para3_n['salient_topic'] = _topic_cols3.idxmax(axis=1)
df_topic_para3_n = df_topic_para3_n[['index', 'file_name', 'salient_topic', 'paragraphs', 'highest_p']]
df_topic_para3_n.columns = ['Index', 'file', 'topic', 'paragraph', 'probability']
# function for extracting the highest-N-ranked paragraphs of each topic
# (dashboard copy, kept consistent with the earlier definition)
def top_n_filter(df, top_n, num_topics=None):
    """Return the *top_n* highest-probability paragraphs for every topic.

    df: paragraph dataframe whose topic-probability columns are named 1..num_topics
        and which also carries 'file_name' and 'paragraphs' columns.
    top_n: number of paragraphs to keep per topic.
    num_topics: number of topic columns; defaults to the module-level ``k``.

    Returns one exploded row per (topic, paragraph) pair with columns
    Index, topic_id, file, paragraph, probability.
    """
    if num_topics is None:
        num_topics = k
    list_topic_id = list(range(1, num_topics + 1))
    list_n_para = []
    list_n_p = []
    list_n_index = []
    list_n_file_name = []
    for x in range(1, num_topics + 1):
        # rank once per topic (the original re-ran nlargest four times per topic)
        top = df.nlargest(top_n, [x])
        list_n_para.append(list(top['paragraphs']))
        list_n_p.append(list(top[x]))
        # BUG FIX: the original read the index from the global df_topic_para3
        # instead of the *df* argument, coupling the helper to one dataframe
        list_n_index.append(list(top.index))
        list_n_file_name.append(list(top['file_name']))
    pd_n_para = pd.DataFrame({'Index': list_n_index, 'topic_id': list_topic_id,
                              'file': list_n_file_name, 'paragraph': list_n_para,
                              'probability': list_n_p})
    return pd_n_para.apply(pd.Series.explode).reset_index().drop('index', axis=1)
# Slider tick marks: 0.00, 0.05, ..., 1.00 (rounded to kill float fuzz)
list_mark = list(np.arange(0, 1.050, 0.050))
list_mark_round = [round(v, 2) for v in list_mark]
marks = {v: str(v) for v in list_mark_round}
# Set up the Dash app; Bootstrap theme plus the local stylesheet that must
# live in the ./assets folder
external_stylesheets = [dbc.themes.BOOTSTRAP, "assets/bootstrap.min.css"]
app = JupyterDash(__name__, external_stylesheets=external_stylesheets)
# Bootstrap card embedding the saved pyLDAvis visualisation.
# Cards provide a flexible content container with multiple variants and options.
_embed_style = {
    'position': 'relative', 'left': '-250px', 'top': '-100px',
    'width': '1400px', 'height': '860px', 'transform': 'scale(0.70)',
}
pyLDAcard = dbc.Card([
    dbc.CardHeader(html.H4("Topic visualisation")),  # card title
    dbc.CardBody([
        dbc.Row(
            dbc.Col([
                html.Embed(src="assets/lda.html", style=_embed_style),
            ])
        ),
    ]),
])
# Card holding the prototypical-paragraph table and its filter controls.
# Each control column is built separately, then assembled into the card.
_slider_col = dbc.Col([
    html.H6("Threshold of probability "),
    dcc.Slider(
        id='slider',
        min=0,
        max=1,
        step=0.01,
        marks=marks,
        value=0.05,
    ),
    html.Div(style={'width': '1000px'}),
])
_topic_col = dbc.Col([
    html.H6("Topic ID"),
    dcc.Input(id="topic_selection", type="number", min=1, max=100, step=1, value=1),
    html.Div(style={'width': '100px'}),
])
_rank_col = dbc.Col([
    html.H6("Number of paragraphs"),
    dcc.Input(id="rank_selection", type="number", min=1, max=1000, step=1, value=5),
    html.Div(style={'width': '100px'}),
])
_mode_col = dbc.Col([
    html.H6("Mode"),
    dcc.Dropdown(
        id='dropdown',
        options=[
            {'label': 'N most prototypical paragraphs for topic K', 'value': 'c1'},
            {'label': 'N most prototypical paragraphs overall', 'value': 'c2'},
            {'label': 'N most prototypical paragraphs for each topic', 'value': 'c3'},
        ],
        # value = 'c1',
        searchable=False,
        clearable=False,
        placeholder="Select a mode",
    ),
    html.Div(style={'width': '380px'}),
])
_footer = dbc.CardFooter(
    dbc.Row([
        dbc.Col([
            html.H6('Please click the "Submit" button after setting the parameters above'),
            html.Div(style={'width': '500px'}),
        ]),
        dbc.Col([
            dbc.Button("Submit", id='submit', color="success"),
            html.Div(id='button'),
        ]),
    ])
)
table_card = dbc.Card([
    dbc.CardHeader(
        dbc.Row([
            dbc.Col(html.H4("Prototypical paragraphs"))
        ])
    ),
    dbc.CardHeader(
        dbc.Row([_slider_col, _topic_col, _rank_col, _mode_col])
    ),
    dbc.CardBody(
        dbc.Col([
            dash_table.DataTable(), html.Div(id="data_table"),
        ])
    ),
    _footer,
])
# Overall page layout: visualisation card on the left, paragraph table on the right
app.layout = html.Div([
    dbc.Container(
        [
            dbc.Row([
                dbc.Col(pyLDAcard, md=7),
                dbc.Col(table_card, md=5),
            ])
        ],
        fluid=True,
    ),
])
@app.callback(
    Output('data_table','children'),
    Input('submit', 'n_clicks'), Input('dropdown', 'value'), Input('slider', 'value'), Input('topic_selection','value'), Input('rank_selection','value')
)
def update_datatable(n_clicks, dropdown_value, slider_value, topic_value, top_n):
    """Rebuild the paragraph DataTable when the Submit button is clicked.

    n_clicks: Submit button click count (only used to detect the trigger).
    dropdown_value: 'c1' (topic K), 'c2' (overall) or 'c3' (each topic).
    slider_value: minimum probability threshold.
    topic_value: topic ID for mode 'c1'.
    top_n: number of paragraphs to show.
    Returns a dash_table.DataTable, or None when no mode is selected / the
    callback was not triggered by the Submit button.
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        button_id = 'No clicks'
    else:
        button_id = ctx.triggered[0]['prop_id'].split('.')[0]
    if button_id == "submit":
        topic = topic_value              # topic filter for mode 'c1'
        Top_N = top_n                    # rank cutoff
        minimum_probability = slider_value  # discard rows below this probability
        if dropdown_value == 'c1':
            # compute the ranking ONCE (the original evaluated top_n_filter
            # three times for a single table refresh)
            ranked = top_n_filter(df_topic_para3, Top_N)
            c_df = ranked[(ranked['topic_id'] == topic) & (ranked['probability'] >= minimum_probability)]
        elif dropdown_value == 'c2':
            # filter first, then take the top N: equivalent to the original
            # top-N-then-filter (any row >= threshold outside the top N implies
            # at least N larger rows that are also >= threshold), but avoids
            # indexing an nlargest subset with a misaligned boolean mask
            pool = df_topic_para3_n[df_topic_para3_n['probability'] >= minimum_probability]
            c_df = pool.nlargest(Top_N, ['probability'])
        elif dropdown_value == 'c3':
            ranked = top_n_filter(df_topic_para3, Top_N)
            c_df = ranked[ranked['probability'] >= minimum_probability]
        else:
            return None
        table = dash_table.DataTable(
            id="table-line-1",
            columns=[
                dict(id=c_df.columns[0], name=c_df.columns[0]),
                dict(id=c_df.columns[1], name=c_df.columns[1]),
                dict(id=c_df.columns[2], name=c_df.columns[2]),
                dict(id=c_df.columns[3], name=c_df.columns[3]),
                dict(id=c_df.columns[4], name=c_df.columns[4], type='numeric', format=Format(precision=2, scheme=Scheme.fixed)),
            ],
            data=c_df.to_dict("records"),
            # page_action='none',
            page_size=5,
            style_table={'height': '1000px', 'overflowY': 'auto'},
            fixed_rows={'headers': True},
            style_header={'border': '1px solid black', 'fontWeight': 'bold', 'textAlign': 'center', 'fontSize': '1px'},
            # NOTE(review): 'maxWidth' appears twice in the original literal
            # (30 then 0); the later value wins, so 0 is kept here
            style_cell={'fontSize': '10px', 'border': '1px solid grey', 'minWidth': 10, 'width': 30, 'whiteSpace': 'normal',
                        'height': 'auto', 'lineHeight': '15px', 'textAlign': 'center', 'textOverflow': 'ellipsis', 'maxWidth': 0},
            css=[{
                'selector': '.dash-spreadsheet td div',
                'rule': '''
                    line-height: 15px;
                    max-height: 300px; min-height: 50px; height: 300px;
                    display: block;
                    overflow-y: hidden;
                '''
            }],
            style_cell_conditional=[
                {'if': {'column_id': 'Index'}, 'width': '5%'},
                {'if': {'column_id': 'file'}, 'width': '10%'},
                {'if': {'column_id': 'topic_id'}, 'width': '5%'},
                {'if': {'column_id': 'paragraph'}, 'width': '75%', 'textAlign': 'left'},
                {'if': {'column_id': 'probability'}, 'width': '5%'},
            ],
            style_as_list_view=True,
        )
        return table
# Launch the Dash server; if port 8092 is already in use, pass a different
# port here (see the note above the imports)
app.run_server(mode = 'external', port=8092)
import warnings
warnings.filterwarnings('ignore')
# remove the hash below and run the chunk to terminate the Dash server
#app._terminate_server_for_port('localhost', 8092)